library(tidyverse)
library(mice)
library(skimr)
library(corrplot)
library(car)
library(ISLR)
library(ggplot2)
library(gridExtra)
library(SamplingStrata)
library(rbin)
library(leaps)

Objective 1:

Question of Interest: Which variables can be used to predict the price of an NYC Airbnb listing?

# Load the raw 2019 NYC Airbnb listings.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, while the later steps (factor summaries,
# pairs() coloured by neighbourhood_group, boxplots by group, GVIF column of
# vif()) rely on the categorical columns being factors.
nycraw <- read.csv(
  "https://raw.githubusercontent.com/JaclynCoate/6372_Project_1/master/AB_NYC_2019.csv",
  header = TRUE,
  strip.white = TRUE,
  stringsAsFactors = TRUE
)
head(nycraw)
##     id                                             name host_id
## 1 2539               Clean & quiet apt home by the park    2787
## 2 2595                            Skylit Midtown Castle    2845
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632
## 4 3831                  Cozy Entire Floor of Brownstone    4869
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322
##     host_name neighbourhood_group neighbourhood latitude longitude
## 1        John            Brooklyn    Kensington 40.64749 -73.97237
## 2    Jennifer           Manhattan       Midtown 40.75362 -73.98377
## 3   Elisabeth           Manhattan        Harlem 40.80902 -73.94190
## 4 LisaRoxanne            Brooklyn  Clinton Hill 40.68514 -73.95976
## 5       Laura           Manhattan   East Harlem 40.79851 -73.94399
## 6       Chris           Manhattan   Murray Hill 40.74767 -73.97500
##         room_type price minimum_nights number_of_reviews last_review
## 1    Private room   149              1                 9  2018-10-19
## 2 Entire home/apt   225              1                45  2019-05-21
## 3    Private room   150              3                 0            
## 4 Entire home/apt    89              1               270  2019-07-05
## 5 Entire home/apt    80             10                 9  2018-11-19
## 6 Entire home/apt   200              3                74  2019-06-22
##   reviews_per_month calculated_host_listings_count availability_365
## 1              0.21                              6              365
## 2              0.38                              2              355
## 3                NA                              1              365
## 4              4.64                              1              194
## 5              0.10                              1                0
## 6              0.59                              1              129
# Inspect the type and first values of every column in the raw data
str(nycraw)
## 'data.frame':    48895 obs. of  16 variables:
##  $ id                            : int  2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
##  $ name                          : Factor w/ 47895 levels ""," Private 1 bdrm Lefferts Gr, BK apt",..: 12565 38008 45007 15583 19210 24843 8252 24890 15478 17564 ...
##  $ host_id                       : int  2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
##  $ host_name                     : Factor w/ 11453 levels "","​ Valéria",..: 4997 4791 2913 6210 5929 1938 3549 9649 6880 1235 ...
##  $ neighbourhood_group           : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
##  $ neighbourhood                 : Factor w/ 221 levels "Allerton","Arden Heights",..: 109 128 95 42 62 138 14 96 203 36 ...
##  $ latitude                      : num  40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
##  $ price                         : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ last_review                   : Factor w/ 1765 levels "","2011-03-28",..: 1503 1717 1 1762 1534 1749 1124 1751 1048 1736 ...
##  $ reviews_per_month             : num  0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : int  365 355 365 194 0 129 0 220 0 188 ...

EDA to determine type of multiple linear regression to perform

Removing logically irrelevant variables

# Drop the columns judged logically irrelevant to price: "id", "name",
# "host_id", "host_name", "last_review", "latitude", "longitude",
# "neighbourhood", "availability_365"
nyc2 <- nycraw %>%
  select(
    -id, -name, -host_id, -host_name, -last_review,
    -latitude, -longitude, -neighbourhood, -availability_365
  )
head(nyc2)
##   neighbourhood_group       room_type price minimum_nights
## 1            Brooklyn    Private room   149              1
## 2           Manhattan Entire home/apt   225              1
## 3           Manhattan    Private room   150              3
## 4            Brooklyn Entire home/apt    89              1
## 5           Manhattan Entire home/apt    80             10
## 6           Manhattan Entire home/apt   200              3
##   number_of_reviews reviews_per_month calculated_host_listings_count
## 1                 9              0.21                              6
## 2                45              0.38                              2
## 3                 0                NA                              1
## 4               270              4.64                              1
## 5                 9              0.10                              1
## 6                74              0.59                              1

Dependent Variable Check

  • Checking the range of the dependent variable for zeros that need to be removed, since a stay in NYC would never be free.
# Keep only the listings whose price is not zero
nyc2 <- nyc2[nyc2$price != 0, ]
invisible(view(nyc2))

NA Evaluation and Drop

# Checking for NAs: tabulate the missing-data patterns per column
# (in the output below only reviews_per_month has missing values)
md.pattern(nyc2)

##       neighbourhood_group room_type price minimum_nights number_of_reviews
## 38833                   1         1     1              1                 1
## 10051                   1         1     1              1                 1
##                         0         0     0              0                 0
##       calculated_host_listings_count reviews_per_month      
## 38833                              1                 1     0
## 10051                              1                 0     1
##                                    0             10051 10051
# Row count before dropping incomplete rows
nrow(nyc2)
## [1] 48884
# Drop every row that contains at least one NA
nyc3 <- na.omit(nyc2)
# Row count after the drop, to confirm the incomplete rows are gone
nrow(nyc3)
## [1] 38833

Zero variance variable check - all show variance so remain in model

# Summarise every column; the sd column in the output below is non-zero for
# all numeric variables, so no zero-variance variable has to be dropped
skim(nyc3)
## Skim summary statistics
##  n obs: 38833 
##  n variables: 7 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────
##             variable missing complete     n n_unique
##  neighbourhood_group       0    38833 38833        5
##            room_type       0    38833 38833        3
##                                   top_counts ordered
##  Man: 16632, Bro: 16438, Que: 4574, Bro: 875   FALSE
##      Ent: 20331, Pri: 17658, Sha: 844, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##                        variable missing complete     n   mean     sd p0
##  calculated_host_listings_count       0    38833 38833   5.16  26.3   1
##                  minimum_nights       0    38833 38833   5.87  17.39  1
##               number_of_reviews       0    38833 38833  29.3   48.19  1
##                           price       0    38833 38833 142.35 196.96 10
##  p25 p50 p75  p100     hist
##    1   1   2   327 ▇▁▁▁▁▁▁▁
##    1   2   4  1250 ▇▁▁▁▁▁▁▁
##    3   9  33   629 ▇▁▁▁▁▁▁▁
##   69 101 170 10000 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##           variable missing complete     n mean   sd   p0  p25  p50  p75
##  reviews_per_month       0    38833 38833 1.37 1.68 0.01 0.19 0.72 2.02
##  p100     hist
##  58.5 ▇▁▁▁▁▁▁▁

Storing all categorical variables as factors

# Store the categorical predictors as factors.
# as.factor() returns an existing factor unchanged, so this is a no-op when
# read.csv() already produced factors and a real conversion when it produced
# character columns (the default since R 4.0).
categorical_cols <- c("neighbourhood_group", "room_type")
nyc3[categorical_cols] <- lapply(nyc3[categorical_cols], as.factor)
# Confirm the column types after the conversion
skim(nyc3)
## Skim summary statistics
##  n obs: 38833 
##  n variables: 7 
## 
## ── Variable type:factor ────────────────────────────────────────────────────────
##             variable missing complete     n n_unique
##  neighbourhood_group       0    38833 38833        5
##            room_type       0    38833 38833        3
##                                   top_counts ordered
##  Man: 16632, Bro: 16438, Que: 4574, Bro: 875   FALSE
##      Ent: 20331, Pri: 17658, Sha: 844, NA: 0   FALSE
## 
## ── Variable type:integer ───────────────────────────────────────────────────────
##                        variable missing complete     n   mean     sd p0
##  calculated_host_listings_count       0    38833 38833   5.16  26.3   1
##                  minimum_nights       0    38833 38833   5.87  17.39  1
##               number_of_reviews       0    38833 38833  29.3   48.19  1
##                           price       0    38833 38833 142.35 196.96 10
##  p25 p50 p75  p100     hist
##    1   1   2   327 ▇▁▁▁▁▁▁▁
##    1   2   4  1250 ▇▁▁▁▁▁▁▁
##    3   9  33   629 ▇▁▁▁▁▁▁▁
##   69 101 170 10000 ▇▁▁▁▁▁▁▁
## 
## ── Variable type:numeric ───────────────────────────────────────────────────────
##           variable missing complete     n mean   sd   p0  p25  p50  p75
##  reviews_per_month       0    38833 38833 1.37 1.68 0.01 0.19 0.72 2.02
##  p100     hist
##  58.5 ▇▁▁▁▁▁▁▁

Checking for Multicollinearity

  • Multicollinearity will weaken the model
    • number_of_reviews and reviews_per_month are correlated at 55%
      • Removing reviews_per_month
corrNYC <- nyc3
# Correlation matrix of the numeric columns, also sent to the data viewer
corrNYCTable <- cor(keep(corrNYC, is.numeric)) %>% view
# Correlation plot of the same numeric columns (upper triangle, clustered order)
corrplot(
  cor(keep(corrNYC, is.numeric)),
  "upper",
  addCoef.col = "white",
  number.digits = 2,
  number.cex = 0.5,
  method = "square",
  order = "hclust",
  tl.srt = 45,
  tl.cex = 0.8
)

invisible(view(corrNYCTable))
# Remove reviews_per_month because of its high correlation with number_of_reviews
nyc4 <- nyc3 %>% select(-reviews_per_month)

Summary Review of Data Set

# Level counts for the factors and quartile summaries for the numeric columns
summary(nyc4)
##     neighbourhood_group           room_type         price        
##  Bronx        :  875    Entire home/apt:20331   Min.   :   10.0  
##  Brooklyn     :16438    Private room   :17658   1st Qu.:   69.0  
##  Manhattan    :16632    Shared room    :  844   Median :  101.0  
##  Queens       : 4574                            Mean   :  142.4  
##  Staten Island:  314                            3rd Qu.:  170.0  
##                                                 Max.   :10000.0  
##  minimum_nights     number_of_reviews calculated_host_listings_count
##  Min.   :   1.000   Min.   :  1.0     Min.   :  1.000               
##  1st Qu.:   1.000   1st Qu.:  3.0     1st Qu.:  1.000               
##  Median :   2.000   Median :  9.0     Median :  1.000               
##  Mean   :   5.868   Mean   : 29.3     Mean   :  5.165               
##  3rd Qu.:   4.000   3rd Qu.: 33.0     3rd Qu.:  2.000               
##  Max.   :1250.000   Max.   :629.0     Max.   :327.000

Removing outliers from minimum nights stay

  • Anything over 365 is more than a year and would be improbable
  • Removing any minimum nights metric over 365
# Keep only listings whose minimum stay is at most 365 nights
nyc4 <- nyc4[nyc4$minimum_nights <= 365, ]
invisible(view(nyc4))

Examining VIFs

  • The below results show us there is no need to remove any variables
# Fit price on every other column ("." expands to all variables except price)
full.model <- lm(price ~ ., data = nyc4)
# Square the third column returned by vif() for each term
vif(full.model)[, 3]^2
##            neighbourhood_group                      room_type 
##                       1.010573                       1.017568 
##                 minimum_nights              number_of_reviews 
##                       1.024385                       1.011797 
## calculated_host_listings_count 
##                       1.027540
# Check the already fitted price model for aliased (linearly dependent) terms
alias(full.model)
## Model :
## price ~ neighbourhood_group + room_type + minimum_nights + number_of_reviews + 
##     calculated_host_listings_count

Reviewing Linearity with Numeric Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
#nyc4 %>% pairs() No color model
# Scatterplot matrix of every column, coloured by neighbourhood group
pairs(nyc4, col = nyc4$neighbourhood_group) #Color by neighborhood

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(full.model)
par(old_par)

Creating new Log price variable

  • Based on the above plots we may benefit from a transformation
    • Log transforming price to create a log-linear regression
# Replace price with its natural log (lprice) for the log-linear model
log.nyc <- nyc4 %>%
  mutate(lprice = log(price)) %>%
  select(-price)
invisible(log.nyc)

Examining VIFs of Log Price Variable

# Fit log price on every other column ("." expands to all variables except lprice)
log.depend.model <- lm(lprice ~ ., data = log.nyc)
# Square the third column returned by vif() for each term
vif(log.depend.model)[, 3]^2
##            neighbourhood_group                      room_type 
##                       1.010573                       1.017568 
##                 minimum_nights              number_of_reviews 
##                       1.024385                       1.011797 
## calculated_host_listings_count 
##                       1.027540
# Check the already fitted log-price model for aliased (linearly dependent) terms
alias(log.depend.model)
## Model :
## lprice ~ neighbourhood_group + room_type + minimum_nights + number_of_reviews + 
##     calculated_host_listings_count

Reviewing Linearity with Independent and Logged Dependent (Price) Variable

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
# Scatterplot matrix of the log-price data, coloured by neighbourhood group
pairs(log.nyc, col = log.nyc$neighbourhood_group)

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log.depend.model)
par(old_par)

Log-log model

  • Due to lack of linearity trying to transform the independent variables to see if we can surface a linear relationship
# Add natural-log versions of the three numeric predictors
log.indep.nyc <- log.nyc %>%
  mutate(
    lreviews = log(number_of_reviews),
    lnights = log(minimum_nights),
    llistings = log(calculated_host_listings_count)
  )
invisible(log.indep.nyc)

# Drop the unlogged originals so only the logged predictors remain
log.indep.nyc <- log.indep.nyc %>%
  select(-minimum_nights, -number_of_reviews, -calculated_host_listings_count)
invisible(log.indep.nyc)

Examining VIFs of Log-Log Model

# Fit log price on every other column ("." expands to all variables except lprice)
log.indep.model <- lm(lprice ~ ., data = log.indep.nyc)
# Square the third column returned by vif() for each term
vif(log.indep.model)[, 3]^2
## neighbourhood_group           room_type            lreviews 
##            1.011796            1.045410            1.040906 
##             lnights           llistings 
##            1.149084            1.083175
# Check the already fitted log-log model for aliased (linearly dependent) terms
alias(log.indep.model)
## Model :
## lprice ~ neighbourhood_group + room_type + lreviews + lnights + 
##     llistings

Reviewing Linearity with Logged Independent and Dependent Variables

  • Curved relationships with the numeric variables
    • Could require a quadratic or logarithmic transformation
# Scatterplot matrix of the log-log data, coloured by neighbourhood group
pairs(log.indep.nyc, col = log.indep.nyc$neighbourhood_group) #Color by neighborhood

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log.indep.model)
par(old_par)

Continuous Variable Manipulation

  • Since we are seeing large clouds of data but no linear trend with logged and unlogged data, we are going to move forward with binning the data to see if it will assist us in determining if there is a relationship between the continuous variables and log price
# Winsorized binning of number_of_reviews against price: 50 bins, winsor_rate 0.05.
# NOTE(review): the summary below reports 0 goods / 0 bads and NaN for WOE, IV
# and entropy, which suggests rbin expects a binary response rather than a
# continuous price — confirm before relying on these statistics.
nyc5 <- rbin_winsorize(nyc4, price, number_of_reviews, 50, winsor_rate = 0.05)
nyc5
## Binning Summary
## ------------------------------
## Method               Winsorize 
## Response             price 
## Predictor            number_of_reviews 
## Bins                 50 
## Count                38827 
## Goods                0 
## Bads                 0 
## Entropy              NaN 
## Information Value    NaN 
## 
## 
## # A tibble: 50 x 7
##    cut_point bin_count  good   bad   woe    iv entropy
##    <chr>         <int> <int> <int> <dbl> <dbl>   <dbl>
##  1 < 3.56        11224     0     0   NaN   NaN     NaN
##  2 < 6.12         4967     0     0   NaN   NaN     NaN
##  3 < 8.68         2306     0     0   NaN   NaN     NaN
##  4 < 11.24        2544     0     0   NaN   NaN     NaN
##  5 < 13.8         1292     0     0   NaN   NaN     NaN
##  6 < 16.36        1581     0     0   NaN   NaN     NaN
##  7 < 18.92         878     0     0   NaN   NaN     NaN
##  8 < 21.48        1125     0     0   NaN   NaN     NaN
##  9 < 24.04         985     0     0   NaN   NaN     NaN
## 10 < 26.6          612     0     0   NaN   NaN     NaN
## # … with 40 more rows
# Scatterplot matrix restricted to the numeric columns
nyc4 %>% keep(is.numeric) %>% pairs() #[Add color]

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(full.model)
par(old_par)

Continuous Variable Bin Manipulation

  • Since we are seeing large clouds of data but no linear trend with logged and unlogged data, we are going to move forward with binning the data to see if it will assist us in determining if there is a relationship between the continuous variables and log price
# Work on a copy so nyc4 keeps its continuous columns
nyc.bins <- nyc4

# Bin each continuous predictor (50, 50 and 10 bins respectively)
nyc.bins[["reviewsBin"]] <- var.bin(nyc.bins[["number_of_reviews"]], bins = 50)
nyc.bins[["nightsBin"]] <- var.bin(nyc.bins[["minimum_nights"]], bins = 50)
nyc.bins[["listBin"]] <- var.bin(nyc.bins[["calculated_host_listings_count"]], bins = 10)

# Drop the unbinned originals so only the binned versions remain
nyc.bins <- nyc.bins %>%
  select(-minimum_nights, -number_of_reviews, -calculated_host_listings_count)
invisible(nyc.bins)

Reviewing Linearity with Binned Independent Variables

  • No linearity is presenting itself with a binned approach of the independent variables
# Fit price on the categorical and binned predictors
nyc.bin.model <- lm(price ~ ., data = nyc.bins)
#nyc.bins  %>% pairs() No color model
pairs(nyc.bins, col = nyc.bins$neighbourhood_group) #Color by neighborhood

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(nyc.bin.model)
par(old_par)

Explore potential correlation Neighborhood v Price

  • Up to this point we have not been able to surface linear relationships between our numeric independent variables and our dependent variable
  • Next we will check for correlation with the categorical variables: room_type & neighbourhood_group
  • Without removing the outliers of price above 400 it is nearly impossible to see whether there is a difference per neighbourhood group. We have removed the prices above 400 to see if the difference becomes visible
  • We see a strong chance of correlation between Price and Neighbourhood Group
# Keep the listings priced at 400 or below
nyc.categorical <- nyc4[nyc4$price <= 400, ]
# Row counts before and after the price cut
nrow(nyc4)
## [1] 38827
nrow(nyc.categorical)
## [1] 37711
# Boxplot of price per neighbourhood group.
# The plot title must be passed as `main`; `title` is not a graphical
# parameter, so it was ignored with warnings and no title was drawn.
# The layout is reset to a single panel first so an earlier 2x2 diagnostic
# grid cannot shrink this plot.
par(mfrow = c(1, 1))
plot(
  nyc.categorical$neighbourhood_group,
  nyc.categorical$price,
  xlab = "Neighbourhood Group",
  ylab = "Price",
  main = "Price v Neighbourhood Group Correlation Check",
  col = c(7, 32, 52, 82, 107)
)

Explore potential correlation Room Type v Price

  • We see a strong chance of correlation between Price and Room Type
# Boxplot of price per room type.
# The plot title must be passed as `main`; `title` is not a graphical
# parameter, so it was ignored with warnings and no title was drawn.
# The layout is reset to a single panel first so an earlier 2x2 diagnostic
# grid cannot shrink this plot.
par(mfrow = c(1, 1))
plot(
  nyc.categorical$room_type,
  nyc.categorical$price,
  xlab = "Room Type",
  ylab = "Price",
  main = "Price v Room Type Correlation Check",
  col = c(7, 32, 52)
)

Reviewing Linearity with Numeric Variables w/ Price > 400 Outliers Removed

# Fit price on every other column using only listings priced at 400 or below
nyc.cat.model <- lm(price ~ ., data = nyc.categorical)
#nyc4 %>% pairs() No color model
pairs(nyc.categorical, col = nyc.categorical$neighbourhood_group) #Color by neighborhood

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(nyc.cat.model)
par(old_par)

Reviewing Linearity with Numeric Variables w/ Log-Linear model Price > 400 Outliers Removed

  • Still no obvious linearity
# Remove listings priced above 400.
# lprice is on the natural-log scale, so the cut-off has to be log(400);
# comparing lprice with 400 directly never matches and leaves every row in
# place. Printed results elsewhere in this report that use log.nyc.outliers
# were rendered with the unfiltered data and need a re-knit after this change.
log.nyc.outliers <- log.nyc[log.nyc$lprice <= log(400), ]

# Log-linear model on the filtered data
log.nyc.outliers.model <- lm(lprice ~ ., data = log.nyc.outliers)
pairs(log.nyc.outliers, col = log.nyc.outliers$neighbourhood_group) #Color by neighborhood

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log.nyc.outliers.model)
par(old_par)

Modeling

  • We are not seeing any linear correlation between the dependent and independent numeric variables
  • We are seeing a strong chance of linear correlation between the dependent and independent categorical variables
  • We have surfaced the best match to the residual assumptions in a log-linear model
    • Due to this we are moving forward with modeling a log-linear model with singular variables as well as all interaction terms
    • This is to add complexity to our model, since we have a low number of variables to select from
      • In adding this complexity we are trying to surface any possible linear variable interactions that may contribute to our model
      • If these are surfaced we will go back and use graphical means to verify the model’s discovery
# Log-linear model with all main effects, the neighbourhood-by-room-type
# interaction, and every pairwise interaction of the numeric predictors
nyc.model <- lm(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers
)
summary(nyc.model)
## 
## Call:
## lm(formula = lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type + 
##     minimum_nights + number_of_reviews + calculated_host_listings_count + 
##     minimum_nights:number_of_reviews + minimum_nights:calculated_host_listings_count + 
##     number_of_reviews:calculated_host_listings_count, data = log.nyc.outliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9647 -0.2953 -0.0307  0.2507  5.3317 
## 
## Coefficients:
##                                                          Estimate
## (Intercept)                                             4.684e+00
## neighbourhood_groupBrooklyn                             3.199e-01
## neighbourhood_groupManhattan                            5.898e-01
## neighbourhood_groupQueens                               1.385e-01
## neighbourhood_groupStaten Island                        3.315e-02
## room_typePrivate room                                  -7.051e-01
## room_typeShared room                                   -1.054e+00
## minimum_nights                                         -2.272e-03
## number_of_reviews                                      -9.790e-05
## calculated_host_listings_count                          1.734e-03
## neighbourhood_groupBrooklyn:room_typePrivate room      -1.151e-01
## neighbourhood_groupManhattan:room_typePrivate room     -4.778e-02
## neighbourhood_groupQueens:room_typePrivate room        -9.000e-03
## neighbourhood_groupStaten Island:room_typePrivate room -3.986e-02
## neighbourhood_groupBrooklyn:room_typeShared room       -2.685e-01
## neighbourhood_groupManhattan:room_typeShared room      -1.296e-02
## neighbourhood_groupQueens:room_typeShared room         -4.901e-02
## neighbourhood_groupStaten Island:room_typeShared room  -6.468e-03
## minimum_nights:number_of_reviews                        1.496e-05
## minimum_nights:calculated_host_listings_count          -6.779e-05
## number_of_reviews:calculated_host_listings_count       -4.478e-05
##                                                        Std. Error t value
## (Intercept)                                             2.719e-02 172.289
## neighbourhood_groupBrooklyn                             2.759e-02  11.592
## neighbourhood_groupManhattan                            2.753e-02  21.423
## neighbourhood_groupQueens                               2.939e-02   4.714
## neighbourhood_groupStaten Island                        4.738e-02   0.700
## room_typePrivate room                                   3.417e-02 -20.635
## room_typeShared room                                    7.750e-02 -13.595
## minimum_nights                                          1.994e-04 -11.392
## number_of_reviews                                       6.509e-05  -1.504
## calculated_host_listings_count                          1.745e-04   9.937
## neighbourhood_groupBrooklyn:room_typePrivate room       3.497e-02  -3.292
## neighbourhood_groupManhattan:room_typePrivate room      3.503e-02  -1.364
## neighbourhood_groupQueens:room_typePrivate room         3.718e-02  -0.242
## neighbourhood_groupStaten Island:room_typePrivate room  6.406e-02  -0.622
## neighbourhood_groupBrooklyn:room_typeShared room        8.259e-02  -3.252
## neighbourhood_groupManhattan:room_typeShared room       8.165e-02  -0.159
## neighbourhood_groupQueens:room_typeShared room          8.733e-02  -0.561
## neighbourhood_groupStaten Island:room_typeShared room   2.299e-01  -0.028
## minimum_nights:number_of_reviews                        4.610e-06   3.246
## minimum_nights:calculated_host_listings_count           8.207e-06  -8.259
## number_of_reviews:calculated_host_listings_count        1.802e-05  -2.485
##                                                        Pr(>|t|)    
## (Intercept)                                             < 2e-16 ***
## neighbourhood_groupBrooklyn                             < 2e-16 ***
## neighbourhood_groupManhattan                            < 2e-16 ***
## neighbourhood_groupQueens                              2.44e-06 ***
## neighbourhood_groupStaten Island                       0.484164    
## room_typePrivate room                                   < 2e-16 ***
## room_typeShared room                                    < 2e-16 ***
## minimum_nights                                          < 2e-16 ***
## number_of_reviews                                      0.132618    
## calculated_host_listings_count                          < 2e-16 ***
## neighbourhood_groupBrooklyn:room_typePrivate room      0.000994 ***
## neighbourhood_groupManhattan:room_typePrivate room     0.172578    
## neighbourhood_groupQueens:room_typePrivate room        0.808711    
## neighbourhood_groupStaten Island:room_typePrivate room 0.533767    
## neighbourhood_groupBrooklyn:room_typeShared room       0.001148 ** 
## neighbourhood_groupManhattan:room_typeShared room      0.873856    
## neighbourhood_groupQueens:room_typeShared room         0.574687    
## neighbourhood_groupStaten Island:room_typeShared room  0.977554    
## minimum_nights:number_of_reviews                       0.001171 ** 
## minimum_nights:calculated_host_listings_count           < 2e-16 ***
## number_of_reviews:calculated_host_listings_count       0.012973 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4761 on 38806 degrees of freedom
## Multiple R-squared:  0.4857, Adjusted R-squared:  0.4854 
## F-statistic:  1832 on 20 and 38806 DF,  p-value: < 2.2e-16

Model selection attempts

# Forward selection over the main effects plus all interaction terms
nyc.fwd <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers,
  method = "forward",
  nvmax = 20
)
# Adjusted R-squared per model size
summary(nyc.fwd)[["adjr2"]]
##  [1] 0.3414065 0.4097375 0.4735707 0.4787086 0.4810700 0.4821272 0.4830010
##  [8] 0.4838801 0.4844713 0.4850894 0.4852571 0.4853593 0.4854451 0.4854614
## [15] 0.4854566 0.4854452 0.4854364 0.4854237 0.4854108 0.4853976
# Residual sum of squares per model size
summary(nyc.fwd)[["rss"]]
##  [1] 11261.364 10092.704  9001.009  8912.930  8872.327  8854.025  8838.858
##  [8]  8823.601  8813.265  8802.473  8799.378  8797.406  8795.712  8795.207
## [15]  8795.062  8795.031  8794.954  8794.944  8794.938  8794.938
# BIC per model size
summary(nyc.fwd)[["bic"]]
##  [1] -16195.91 -20439.42 -24873.62 -25244.87 -25411.58 -25481.19 -25537.19
##  [8] -25593.70 -25628.64 -25665.65 -25668.74 -25666.87 -25663.78 -25655.45
## [15] -25645.52 -25635.09 -25624.86 -25614.34 -25603.80 -25593.23
# Forward selection without the numeric-by-numeric interaction terms
nyc.fwd2 <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count,
  data = log.nyc.outliers,
  method = "forward",
  nvmax = 50
)
# Adjusted R-squared per model size
summary(nyc.fwd2)[["adjr2"]]
##  [1] 0.3414065 0.4097375 0.4735707 0.4787086 0.4810700 0.4821272 0.4830010
##  [8] 0.4836271 0.4842493 0.4843307 0.4843873 0.4843831 0.4843716 0.4843624
## [15] 0.4843503 0.4843373 0.4843240
# Residual sum of squares per model size
summary(nyc.fwd2)[["rss"]]
##  [1] 11261.364 10092.704  9001.009  8912.930  8872.327  8854.025  8838.858
##  [8]  8827.925  8817.061  8815.443  8814.247  8814.093  8814.063  8813.992
## [15]  8813.973  8813.967  8813.967
# BIC per model size
summary(nyc.fwd2)[["bic"]]
##  [1] -16195.91 -20439.42 -24873.62 -25244.87 -25411.58 -25481.19 -25537.19
##  [8] -25574.68 -25611.92 -25608.49 -25603.18 -25593.30 -25582.86 -25572.61
## [15] -25562.13 -25551.58 -25541.02
# Backward selection over the main effects plus all interaction terms
nyc.bck <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers,
  method = "backward",
  nvmax = 20
)
# Adjusted R-squared per model size
summary(nyc.bck)[["adjr2"]]
##  [1] 0.3414065 0.4097375 0.4735707 0.4787086 0.4810700 0.4821272 0.4830010
##  [8] 0.4838801 0.4844713 0.4850894 0.4852571 0.4853593 0.4854451 0.4854614
## [15] 0.4854566 0.4854452 0.4854364 0.4854237 0.4854108 0.4853976
# Residual sum of squares per model size
summary(nyc.bck)[["rss"]]
##  [1] 11261.364 10092.704  9001.009  8912.930  8872.327  8854.025  8838.858
##  [8]  8823.601  8813.265  8802.473  8799.378  8797.406  8795.712  8795.207
## [15]  8795.062  8795.031  8794.954  8794.944  8794.938  8794.938
# BIC per model size
summary(nyc.bck)[["bic"]]
##  [1] -16195.91 -20439.42 -24873.62 -25244.87 -25411.58 -25481.19 -25537.19
##  [8] -25593.70 -25628.64 -25665.65 -25668.74 -25666.87 -25663.78 -25655.45
## [15] -25645.52 -25635.09 -25624.86 -25614.34 -25603.80 -25593.23
# Exhaustive search over the main effects plus all interaction terms
nyc.exh <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers,
  method = "exhaustive",
  nvmax = 20
)
# Adjusted R-squared per model size
summary(nyc.exh)[["adjr2"]]
##  [1] 0.3414065 0.4097375 0.4735707 0.4787086 0.4810700 0.4821272 0.4830010
##  [8] 0.4838801 0.4844713 0.4850894 0.4852571 0.4853593 0.4854451 0.4854614
## [15] 0.4854566 0.4854452 0.4854364 0.4854237 0.4854108 0.4853976
# Residual sum of squares per model size
summary(nyc.exh)[["rss"]]
##  [1] 11261.364 10092.704  9001.009  8912.930  8872.327  8854.025  8838.858
##  [8]  8823.601  8813.265  8802.473  8799.378  8797.406  8795.712  8795.207
## [15]  8795.062  8795.031  8794.954  8794.944  8794.938  8794.938
# BIC per model size
summary(nyc.exh)[["bic"]]
##  [1] -16195.91 -20439.42 -24873.62 -25244.87 -25411.58 -25481.19 -25537.19
##  [8] -25593.70 -25628.64 -25665.65 -25668.74 -25666.87 -25663.78 -25655.45
## [15] -25645.52 -25635.09 -25624.86 -25614.34 -25603.80 -25593.23
# Sequential replacement ("seqrep") search over the main effects plus all
# interaction terms
nyc.seq <- regsubsets(
  lprice ~ neighbourhood_group + room_type + neighbourhood_group:room_type +
    minimum_nights + number_of_reviews + calculated_host_listings_count +
    minimum_nights:number_of_reviews +
    minimum_nights:calculated_host_listings_count +
    number_of_reviews:calculated_host_listings_count,
  data = log.nyc.outliers,
  method = "seqrep",
  nvmax = 20
)
# Summarise the seqrep fit itself. The summaries previously printed nyc.exh
# (the exhaustive fit) again, so the seqrep results were never inspected.
# NOTE: the numbers shown after each call below were rendered from nyc.exh;
# re-knit to refresh them with the nyc.seq values.
# Adjusted R-squared per model size
summary(nyc.seq)$adjr2
##  [1] 0.3414065 0.4097375 0.4735707 0.4787086 0.4810700 0.4821272 0.4830010
##  [8] 0.4838801 0.4844713 0.4850894 0.4852571 0.4853593 0.4854451 0.4854614
## [15] 0.4854566 0.4854452 0.4854364 0.4854237 0.4854108 0.4853976
# Residual sum of squares per model size
summary(nyc.seq)$rss
##  [1] 11261.364 10092.704  9001.009  8912.930  8872.327  8854.025  8838.858
##  [8]  8823.601  8813.265  8802.473  8799.378  8797.406  8795.712  8795.207
## [15]  8795.062  8795.031  8794.954  8794.944  8794.938  8794.938
# BIC per model size
summary(nyc.seq)$bic
##  [1] -16195.91 -20439.42 -24873.62 -25244.87 -25411.58 -25481.19 -25537.19
##  [8] -25593.70 -25628.64 -25665.65 -25668.74 -25666.87 -25663.78 -25655.45
## [15] -25645.52 -25635.09 -25624.86 -25614.34 -25603.80 -25593.23

Assumptions Check

  • Residuals are normally distributed
    • Log-linear model has the closest to normally distributed residuals from the last plot
    • Near normal distribution of residuals
    • Invoking the Central Limit Theorem due to such a large sample size
  • Constant variance
    • Near normal QQ-plot
# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so the grid does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log.nyc.outliers.model)
par(old_par)

  • Independence
    • Assumed
  • Multicollinearity
    • Confirmed with VIFs and pairs plot that there is no multicollinearity occurring
# Squared third column of vif() for each term of the main-effects log-price model.
# NOTE(review): this checks log.depend.model, not nyc.model (the fit with
# interaction terms) — confirm this is the intended model for the check.
vif(log.depend.model)[,3]^2
##            neighbourhood_group                      room_type 
##                       1.010573                       1.017568 
##                 minimum_nights              number_of_reviews 
##                       1.024385                       1.011797 
## calculated_host_listings_count 
##                       1.027540
# Scatterplot matrix of the modelling data, coloured by neighbourhood group,
# used as a visual check for relationships between predictors
pairs(log.nyc.outliers, col=log.nyc.outliers$neighbourhood_group) #Color by neighborhood

  • Removed all prices over 400 to help reduce the data set and uncover any correlation that was present between the dependent variable and the independent categorical variables
#log.nyc.outliers <- log.nyc[!(log.nyc$lprice > 400),]

MLR May Not Be The Best

  • Multiple linear regression is just one option in building a predictive model for a continuous response
  • We are seeing it as a bad option because
    • The true relationship between the response and predictors is NOT “linear”. The relationships are complex.
      • We have gotten close, but we have worked extremely hard in specifying our model and manipulating the raw data to surface a linear relationship
      • This makes the model difficult to interpret in real-world applications
    • Since the above is true and our data is very large, we think that other methods such as Random Forest or K-NN would perform better.
      • These options are less time consuming because the model complexity is built into the algorithm
      • We also do not have to specify how a relationship exists ahead of time
  • Since we see a strong relationship between the categorical variables, we move forward with a Two-Way ANOVA model to create a model to predict the price of an NYC Airbnb.